Prep data

Load necessary packages

# Set the working directory so the relative data path used later
# ("books_FK.csv") resolves.
# NOTE(review): this path is machine-specific; anyone else running the script
# has to edit it.
setwd("~/Desktop/working-with-lyle/Formality_Project")

# Install pacman only when it is missing. requireNamespace() tests for the
# package without attaching it, which is the intended use here (require()
# returns FALSE instead of failing and is meant for attaching).
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# Attach every package the analysis uses, installing any that are missing.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
pacman::p_load(
  tidyverse, rlang, zoo, lubridate, plotrix, ggpubr, caret, broom,
  kableExtra, reactable,
  install = TRUE
)

Define Aesthetics for graphs and stuff

# Hex colour palettes (not referenced elsewhere in the visible part of this
# file).
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")

# Shared plot theme: theme_classic() with a 16 pt "Futura Medium" base font,
# the legend on top (12 pt text), black axis text and lines, and no y-axis
# tick marks. Every plot below adds this object before its own tweaks.
plot_aes <- theme_classic() +
  theme(
    text = element_text(size = 16, family = "Futura Medium"),
    legend.position = "top",
    legend.text = element_text(size = 12),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

Define Table Functions

 # Render a fitted model's coefficient table as a styled kable.
 #
 # model_data: a model object that tidy() accepts (this file passes lm fits).
 # Returns the result of kableExtra::kable_styling(); the std.error, statistic
 # and p.value columns are renamed to SE, t and p before rendering.
 table_model <- function(model_data) {
   coef_table <- tidy(model_data)
   coef_table <- rename(
     coef_table,
     "SE" = std.error,
     "t" = statistic,
     "p" = p.value
   )
   kableExtra::kable_styling(kable(coef_table))
 }

Load data and do a quick clean of missing data and prize winners

# Read the Flesch-Kincaid data file into `df`. Later chunks use its
# ORIG_PUBL_DATE, AUTH_LAST, AUTH_GENDER, filename, readability and
# grade_level columns.
df <- read_csv(file = "books_FK.csv")

Tidy the data

That is, one row per year, per variable

 # Collapse df to one row per publication year, holding the mean and the
 # standard error (std.error(), available because plotrix is loaded) of each
 # score. across() with a named list replaces the deprecated funs() helper and
 # produces the same column names: <variable>_mean and <variable>_std.error.
 tidy_df <- df %>%
   group_by(ORIG_PUBL_DATE) %>%
   summarise(
     across(
       c(readability, grade_level),
       list(mean = mean, std.error = std.error)
     ),
     .groups = "drop"
   )

# Yearly means for the reference year (1933) used for centering.
year_means <- tidy_df %>%
  filter(ORIG_PUBL_DATE == 1933)

# Centering needs exactly one reference row; fail loudly otherwise instead of
# silently recycling or producing empty columns.
stopifnot(nrow(year_means) == 1)

# Center each yearly mean on its 1933 value. The reference values are read
# from year_means rather than typed in by hand (previously the rounded
# constants 85.97 and 4.996), so they stay correct if the data change.
# Intercepts of the centered models can differ from results obtained with the
# rounded constants by the size of that rounding.
tidy_df$readability_centered <-
  tidy_df$readability_mean - year_means$readability_mean
tidy_df$grade_level_centered <-
  tidy_df$grade_level_mean - year_means$grade_level_mean

Flesch-Kincaid Description

Flesch-Kincaid Ease of Readability: higher scores indicate material that is easier to read; lower numbers mark passages that are more difficult to read.

The Flesch–Kincaid Grade Level Score: presents a score as a U.S. grade level, making it easier for teachers, parents, librarians, and others to judge the readability level of various books and texts.

Corpus Summary Stats

The following corpus consists of 599 books, from 599 authors, ranging from 1933 to 2020.

Please note that these analyses were conducted on the folder titled text-plain!

Raw count of Books

# Count the rows of df (after narrowing it to the filename column) and show
# the single total as a striped table.
reactable::reactable(
  dplyr::summarize(select(df, filename), n = n()),
  striped = TRUE
)

Number of authors per year

# Number of distinct (year, author last name) pairs per publication year,
# stored as a striped reactable widget and then printed.
auth_year <- unique(select(df, ORIG_PUBL_DATE, AUTH_LAST)) %>%
  group_by(ORIG_PUBL_DATE) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)
auth_year

Sex Distribution

# Number of distinct (author gender, filename) pairs per gender category,
# stored as a striped reactable widget and then printed.
auth_sex <- unique(select(df, AUTH_GENDER, filename)) %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)
auth_sex

Readability by Gender

M and F = the book was co-written by male and female authors; it is not an aggregate of the male and female categories.

 # Mean readability score per author-gender category, stored as a striped
 # reactable widget and then printed.
 #
 # Rows are de-duplicated on (AUTH_GENDER, filename, readability). The earlier
 # key of (AUTH_GENDER, readability) dropped distinct books that happened to
 # share a gender and an identical score, which biased the group means.
 # filename serves as the book identifier, as in the sex-distribution table.
 Read_sex <- df %>%
   select(AUTH_GENDER, filename, readability) %>%
   unique() %>%
   group_by(AUTH_GENDER) %>%
   dplyr::summarize(mean = mean(readability)) %>%
   reactable::reactable(striped = TRUE)
 Read_sex

Grade Level by Gender

# Mean Flesch-Kincaid grade level per author-gender category, stored as a
# striped reactable widget and then printed.
#
# Rows are de-duplicated on (AUTH_GENDER, filename, grade_level). The earlier
# key of (AUTH_GENDER, grade_level) dropped distinct books that happened to
# share a gender and an identical score, which biased the group means.
# filename serves as the book identifier, as in the sex-distribution table.
Grade_sex <- df %>%
  select(AUTH_GENDER, filename, grade_level) %>%
  unique() %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(mean = mean(grade_level)) %>%
  reactable::reactable(striped = TRUE)
Grade_sex

Flesch-Kincaid Graphs

Please see attached files for the graphs if needed.

Plot the Smoothed Data

#Plot our smoothed data 

#we are using Non-tidy data here to capture the individual variation 

#readability

# Scatter of readability against publication year for every row of df, with a
# loess smoother (span 0.70). The annotated estimate and p-value are typed in
# by hand; they match the raw-data regression table shown later in this file.
# The single theme() call carries the net effect of the stacked theme tweaks:
# 14 pt axis text, rotated x labels, 20 pt bold axis titles and a centred
# 20 pt bold title.
readability_smooth <- ggplot(
  df,
  aes(x = ORIG_PUBL_DATE, y = readability, group = 1)
) +
  ggtitle("Readability") +
  geom_point(colour = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.70) +
  annotate(
    "text",
    x = 1935, y = 100, size = 3.5,
    label = "\n             estimate = 0.0796  \n             p-value < 0.001\n           \n           "
  ) +
  labs(x = "Year", y = "Ease of Readability") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )

#grade level
# Scatter of grade level against publication year for every row of df, with a
# loess smoother (span 0.70). The annotated estimate and p-value are typed in
# by hand; they match the raw-data regression table shown later in this file.
# The single theme() call carries the net effect of the stacked theme tweaks.
grade_level_smooth <- ggplot(
  df,
  aes(x = ORIG_PUBL_DATE, y = grade_level, group = 1)
) +
  ggtitle("Reading Grade Level") +
  geom_point(colour = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.70) +
  annotate(
    "text",
    x = 1935, y = 12, size = 3.5,
    label = "\n             estimate = -0.0199     \n             p-value < 0.001\n           \n           "
  ) +
  labs(x = "Year", y = "Reading Grade Level") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )

# Stack the two loess plots and add a figure title and footnote.
# geom_smooth() draws a confidence band by default (se = TRUE, level = 0.95),
# so the footnote describes the shading as a 95% confidence interval; it
# previously called it a standard error.
smooth_graphs <- ggpubr::ggarrange(
  readability_smooth, grade_level_smooth,
  ncol = 1, nrow = 2,
  common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  smooth_graphs,
  top = text_grob(
    "Smooth Readability Graphs",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Shaded band represents the 95% confidence interval of the loess fit.",
    color = "Black", hjust = 1.05, x = 1, face = "italic", size = 12
  )
)

Plotting the smoothed data by year

# Yearly mean readability (one point per row of tidy_df) with a loess smoother
# (span 0.90). The annotated estimate and p-value are typed in by hand; they
# match the year-aggregated regression tables shown later in this file.
# The single theme() call carries the net effect of the stacked theme tweaks.
readability_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = readability_mean, group = 1)
) +
  ggtitle("Readability") +
  geom_point(colour = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.90) +
  annotate(
    "text",
    x = 1935, y = 87, size = 3.5,
    label = "\n             estimate = 0.0745  \n             p-value < 0.001\n           \n           "
  ) +
  labs(x = "Year", y = "Ease of Readability") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )

# Yearly mean grade level (one point per row of tidy_df) with a loess smoother
# (span 0.90). The annotated estimate and p-value are typed in by hand; they
# match the year-aggregated regression tables shown later in this file.
# The single theme() call carries the net effect of the stacked theme tweaks.
grade_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = grade_level_mean, group = 1)
) +
  ggtitle("Grade Level") +
  geom_point(colour = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.90) +
  annotate(
    "text",
    x = 1935, y = 8, size = 3.5,
    label = "\n             estimate = -0.0192 \n             p-value < 0.001\n           \n           "
  ) +
  labs(x = "Year", y = "Grade Level Score") +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Stack the two year-level loess plots and add a figure title and footnote.
# geom_smooth() draws a confidence band by default (se = TRUE, level = 0.95),
# so the footnote describes the shading as a 95% confidence interval; it
# previously called it a standard error. The second footnote line is kept
# as it was.
tidy_smooth_graphs <- ggpubr::ggarrange(
  readability_smooth_tidy, grade_smooth_tidy,
  ncol = 1, nrow = 2,
  common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob(
    "Smooth Flesch-Kincaid",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Shaded band represents the 95% confidence interval of the loess fit.\n                Estimates displayed are from mean centered analyses (data centered on 1933)",
    color = "Black", hjust = 1, x = 1, face = "italic", size = 12
  )
)

Raw Data by Year

# Yearly mean readability drawn as a line, with a ribbon spanning the mean
# plus/minus one standard error (columns of tidy_df).
# The title and y-axis label now spell "Readability" correctly (they read
# "Readbility"), matching the other readability plots in this file.
# The single theme() call carries the net effect of the stacked theme tweaks.
Readability <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = readability_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = readability_mean - readability_std.error,
      ymax = readability_mean + readability_std.error
    ),
    alpha = 0.2
  ) +
  ggtitle("Readability") +
  plot_aes +
  labs(x = "Year", y = "Ease of Readability") +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )

# Yearly mean grade level drawn as a line, with a ribbon spanning the mean
# plus/minus one standard error (columns of tidy_df).
# The single theme() call carries the net effect of the stacked theme tweaks.
grade_level <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = grade_level_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = grade_level_mean - grade_level_std.error,
      ymax = grade_level_mean + grade_level_std.error
    ),
    alpha = 0.2
  ) +
  ggtitle("Grade Level") +
  plot_aes +
  labs(x = "Year", y = "Flesch-Kincaid Grade Level") +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )

#raw graphs
# Stack the two year-level line plots and add a figure title and footnote.
# The ribbons in these plots are mean +/- standard error, which is what the
# footnote states.
raw_graphs <- ggpubr::ggarrange(
  Readability, grade_level,
  ncol = 1, nrow = 2,
  common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  raw_graphs,
  top = text_grob(
    "Raw Flesch-Kincaid Graphs (grouped by year)",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Horizontal shading represents Standard Error. ",
    color = "Black", hjust = 1.05, x = 1, face = "italic", size = 16
  )
)

Build Simple Regression Models

Models presented in order: Raw data, aggregated by year, centered on 1933

Ease of Readability

# Raw data: linear trend of readability over publication year, all rows of df
Readability_RAW <- lm(formula = readability ~ ORIG_PUBL_DATE, data = df)

# Tidy data: the same trend fitted to the yearly means in tidy_df
Readability_TIDY <- lm(
  formula = readability_mean ~ ORIG_PUBL_DATE,
  data = tidy_df
)

# Centered outcome: the yearly means shifted by a constant, so only the
# intercept can differ from the tidy model; the slope is the same.
Readability_centered <- lm(
  formula = readability_centered ~ ORIG_PUBL_DATE,
  data = tidy_df
)

# Coefficient table for the raw-data readability model
Readability_RAW %>% table_model()
term estimate SE t p
(Intercept) -73.6851 26.1862 -2.814 0.0051
ORIG_PUBL_DATE 0.0796 0.0132 6.021 0.0000
# Coefficient table for the year-aggregated readability model
Readability_TIDY %>% table_model()
term estimate SE t p
(Intercept) -63.6450 24.9021 -2.556 0.0124
ORIG_PUBL_DATE 0.0745 0.0126 5.912 0.0000
# Coefficient table for the centered readability model
Readability_centered %>% table_model()
term estimate SE t p
(Intercept) -149.6150 24.9021 -6.008 0
ORIG_PUBL_DATE 0.0745 0.0126 5.912 0

Grade Level Reading

# Raw data: linear trend of grade level over publication year, all rows of df
Grade_RAW <- lm(formula = grade_level ~ ORIG_PUBL_DATE, data = df)

# Tidy data: the same trend fitted to the yearly means in tidy_df
Grade_TIDY <- lm(
  formula = grade_level_mean ~ ORIG_PUBL_DATE,
  data = tidy_df
)

# Centered outcome: the yearly means shifted by a constant, so only the
# intercept can differ from the tidy model; the slope is the same.
Grade_centered <- lm(
  formula = grade_level_centered ~ ORIG_PUBL_DATE,
  data = tidy_df
)


# Coefficient table for the raw-data grade-level model
Grade_RAW %>% table_model()
term estimate SE t p
(Intercept) 44.6362 5.7299 7.790 0
ORIG_PUBL_DATE -0.0199 0.0029 -6.881 0
# Coefficient table for the year-aggregated grade-level model
Grade_TIDY %>% table_model()
term estimate SE t p
(Intercept) 43.1663 5.3006 8.144 0
ORIG_PUBL_DATE -0.0192 0.0027 -7.144 0
# Coefficient table for the centered grade-level model
Grade_centered %>% table_model()
term estimate SE t p
(Intercept) 38.1703 5.3006 7.201 0
ORIG_PUBL_DATE -0.0192 0.0027 -7.144 0